In [1]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from scipy.stats import zscore
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import lightgbm as lgb
import pickle
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
from sklearn.tree import plot_tree
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import sys
import pydot
import warnings
warnings.simplefilter(action='ignore')
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn import metrics

1.Import and understand the data.

1.A. Import ‘signal-data.csv’ as DataFrame

In [3]:
df_signal = pd.read_csv('/content/drive/MyDrive/signal-data.csv')
df_signal.head()
Out[3]:
Time 0 1 2 3 4 5 6 7 8 ... 581 582 583 584 585 586 587 588 589 Pass/Fail
0 2008-07-19 11:55:00 3030.93 2564.00 2187.7333 1411.1265 1.3602 100.0 97.6133 0.1242 1.5005 ... NaN 0.5005 0.0118 0.0035 2.3630 NaN NaN NaN NaN -1
1 2008-07-19 12:32:00 3095.78 2465.14 2230.4222 1463.6606 0.8294 100.0 102.3433 0.1247 1.4966 ... 208.2045 0.5019 0.0223 0.0055 4.4447 0.0096 0.0201 0.0060 208.2045 -1
2 2008-07-19 13:17:00 2932.61 2559.94 2186.4111 1698.0172 1.5102 100.0 95.4878 0.1241 1.4436 ... 82.8602 0.4958 0.0157 0.0039 3.1745 0.0584 0.0484 0.0148 82.8602 1
3 2008-07-19 14:43:00 2988.72 2479.90 2199.0333 909.7926 1.3204 100.0 104.2367 0.1217 1.4882 ... 73.8432 0.4990 0.0103 0.0025 2.0544 0.0202 0.0149 0.0044 73.8432 -1
4 2008-07-19 15:22:00 3032.24 2502.87 2233.3667 1326.5200 1.5334 100.0 100.3967 0.1235 1.5031 ... NaN 0.4800 0.4766 0.1045 99.3032 0.0202 0.0149 0.0044 73.8432 -1

5 rows × 592 columns

In [4]:
print("The Shape of dataframe is", df_signal.shape)
The Shape of dataframe is (1567, 592)

1.B. Print 5 point summary and share at least 2 observations.

In [5]:
df_signal.describe().T
Out[5]:
count mean std min 25% 50% 75% max
0 1561.0 3014.452896 73.621787 2743.2400 2966.260000 3011.4900 3056.6500 3356.3500
1 1560.0 2495.850231 80.407705 2158.7500 2452.247500 2499.4050 2538.8225 2846.4400
2 1553.0 2200.547318 29.513152 2060.6600 2181.044400 2201.0667 2218.0555 2315.2667
3 1553.0 1396.376627 441.691640 0.0000 1081.875800 1285.2144 1591.2235 3715.0417
4 1553.0 4.197013 56.355540 0.6815 1.017700 1.3168 1.5257 1114.5366
... ... ... ... ... ... ... ... ...
586 1566.0 0.021458 0.012358 -0.0169 0.013425 0.0205 0.0276 0.1028
587 1566.0 0.016475 0.008808 0.0032 0.010600 0.0148 0.0203 0.0799
588 1566.0 0.005283 0.002867 0.0010 0.003300 0.0046 0.0064 0.0286
589 1566.0 99.670066 93.891919 0.0000 44.368600 71.9005 114.7497 737.3048
Pass/Fail 1567.0 -0.867262 0.498010 -1.0000 -1.000000 -1.0000 -1.0000 1.0000

591 rows × 8 columns

Observation:

  1. In Pass/Fail column, the pass count is less than 25% of all records. There is no null values in the Pass/Fail column.
  2. As per count column, data set contains null values.
  3. Time column is missing.
  4. If we compare the range in the describe table, the data range in the columns are very different. Standardzation should be must.
In [6]:
df_signal['Time'].info()
<class 'pandas.core.series.Series'>
RangeIndex: 1567 entries, 0 to 1566
Series name: Time
Non-Null Count  Dtype 
--------------  ----- 
1567 non-null   object
dtypes: object(1)
memory usage: 12.4+ KB
In [7]:
df_signal['Time'] = pd.to_datetime(df_signal['Time'])

2. Data cleansing:

2.A. Write a for loop which will remove all the features with 20%+ Null values and impute rest with mean of the feature.

In [8]:
df_signal.isna().sum()
Out[8]:
Time          0
0             6
1             7
2            14
3            14
             ..
586           1
587           1
588           1
589           1
Pass/Fail     0
Length: 592, dtype: int64
In [9]:
col_name=df_signal.columns
count=0
col_with_null_nalues = []
for i in col_name:
  if (((df_signal[i].isna().sum())/df_signal.shape[1])>0.2):
       count=count+1
       df_signal.drop(columns=[i], inplace=True)  ## Deleting the columns which has more than 20% null values
       col_with_null_nalues.append(i)
  else :
        df_signal[i].fillna(df_signal[i].mean(), inplace=True) #Imputing with mean
print("Numbers of columns deleted =", count)
print("Numbers of columns in the dataset", df_signal.shape[1])
print("Deleted columns:", col_with_null_nalues)
Numbers of columns deleted = 52
Numbers of columns in the dataset 540
Deleted columns: ['72', '73', '85', '109', '110', '111', '112', '157', '158', '220', '244', '245', '246', '247', '292', '293', '345', '346', '358', '382', '383', '384', '385', '492', '516', '517', '518', '519', '546', '547', '548', '549', '550', '551', '552', '553', '554', '555', '556', '557', '562', '563', '564', '565', '566', '567', '568', '569', '578', '579', '580', '581']
In [10]:
df_signal.head()
Out[10]:
Time 0 1 2 3 4 5 6 7 8 ... 577 582 583 584 585 586 587 588 589 Pass/Fail
0 2008-07-19 11:55:00 3030.93 2564.00 2187.7333 1411.1265 1.3602 100.0 97.6133 0.1242 1.5005 ... 14.9509 0.5005 0.0118 0.0035 2.3630 0.021458 0.016475 0.005283 99.670066 -1
1 2008-07-19 12:32:00 3095.78 2465.14 2230.4222 1463.6606 0.8294 100.0 102.3433 0.1247 1.4966 ... 10.9003 0.5019 0.0223 0.0055 4.4447 0.009600 0.020100 0.006000 208.204500 -1
2 2008-07-19 13:17:00 2932.61 2559.94 2186.4111 1698.0172 1.5102 100.0 95.4878 0.1241 1.4436 ... 9.2721 0.4958 0.0157 0.0039 3.1745 0.058400 0.048400 0.014800 82.860200 1
3 2008-07-19 14:43:00 2988.72 2479.90 2199.0333 909.7926 1.3204 100.0 104.2367 0.1217 1.4882 ... 8.5831 0.4990 0.0103 0.0025 2.0544 0.020200 0.014900 0.004400 73.843200 -1
4 2008-07-19 15:22:00 3032.24 2502.87 2233.3667 1326.5200 1.5334 100.0 100.3967 0.1235 1.5031 ... 10.9698 0.4800 0.4766 0.1045 99.3032 0.020200 0.014900 0.004400 73.843200 -1

5 rows × 540 columns

2.B. Identify and drop the features which are having same value for all the rows.

In [11]:
col_name2=df_signal.columns
q=[]
for j in col_name2:
  if(len(df_signal[j].value_counts())==1):
    q.append(j)
    df_signal.drop(columns=[j], inplace=True, axis=1)

print("Numbers of column in the dataset", df_signal.shape[1])
print("Columns awith same data in all the rows:", q)
Numbers of column in the dataset 424
Columns awith same data in all the rows: ['5', '13', '42', '49', '52', '69', '97', '141', '149', '178', '179', '186', '189', '190', '191', '192', '193', '194', '226', '229', '230', '231', '232', '233', '234', '235', '236', '237', '240', '241', '242', '243', '256', '257', '258', '259', '260', '261', '262', '263', '264', '265', '266', '276', '284', '313', '314', '315', '322', '325', '326', '327', '328', '329', '330', '364', '369', '370', '371', '372', '373', '374', '375', '378', '379', '380', '381', '394', '395', '396', '397', '398', '399', '400', '401', '402', '403', '404', '414', '422', '449', '450', '451', '458', '461', '462', '463', '464', '465', '466', '481', '498', '501', '502', '503', '504', '505', '506', '507', '508', '509', '512', '513', '514', '515', '528', '529', '530', '531', '532', '533', '534', '535', '536', '537', '538']
In [12]:
print(df_signal.shape)
(1567, 424)

2.C Drop other features if required using relevant functional knowledge. Clearly justify the same.

Check for the duplicate rows

In [13]:
df_signal.duplicated().sum()
Out[13]:
0

As mention in the data description of the project, time is used only for data time stamp for that specific test point. So we can drop it.

Target column “ –1” corresponds to a pass and “1” corresponds to a fail. Update the pass as 0 and fail as 1.

In [14]:
df_signal = df_signal.drop('Time', axis=1)
In [15]:
df_signal['Pass/Fail'] = df_signal['Pass/Fail'].replace({ -1:0,  1:1})
df_signal['Pass/Fail'] = df_signal['Pass/Fail'].astype(int)
In [16]:
df_signal.sample(5)
Out[16]:
0 1 2 3 4 6 7 8 9 10 ... 577 582 583 584 585 586 587 588 589 Pass/Fail
399 2996.53 2533.52 2202.1222 1034.5674 0.7760 104.6156 0.1219 1.5320 0.0068 -0.0173 ... 8.3030 0.5022 0.0156 0.0037 3.1083 0.0120 0.0104 0.0036 86.7035 0
44 3047.78 2490.71 2166.5222 907.0746 1.0647 104.5211 0.1221 1.5764 -0.0219 -0.0080 ... 8.2142 0.5009 0.0155 0.0041 3.0904 0.0149 0.0158 0.0054 106.1812 0
730 3136.34 2442.45 2250.7445 996.4071 0.8572 106.2956 0.1172 1.4285 -0.0222 -0.0006 ... 13.2632 0.5022 0.0173 0.0040 3.4516 0.0335 0.0084 0.0030 25.1494 0
618 2993.11 2498.91 2171.8556 940.9917 1.2906 103.4733 0.1234 1.4701 -0.0181 -0.0010 ... 14.0761 0.4959 0.0126 0.0037 2.5402 0.0227 0.0149 0.0052 65.4831 0
651 2978.62 2478.81 2236.0667 1680.1825 1.4834 98.6889 0.1221 1.4149 -0.0045 0.0085 ... 7.6125 0.4993 0.0121 0.0029 2.4170 0.0297 0.0115 0.0040 38.7106 0

5 rows × 423 columns

Check for zero standard deviation

In [17]:
X = df_signal.drop('Pass/Fail',axis=1)
#Test data
y=df_signal['Pass/Fail']
In [18]:
#check zero stanadard deviation in the columns
zero_std_columns = []

for column in X.columns:
    std_dev = X[column].std()
    std_dev= std_dev.round()
    if std_dev == 0:
        zero_std_columns.append(column)


X = X.drop(zero_std_columns, axis=1)
print("Columns with zero standard deviation so deleting it:", zero_std_columns)
print("Numbers of column in the dataset", X.shape[1])
Columns with zero standard deviation so deleting it: ['7', '8', '9', '10', '11', '17', '19', '20', '25', '26', '29', '30', '37', '38', '44', '47', '53', '54', '56', '57', '58', '61', '74', '75', '76', '77', '78', '79', '80', '81', '82', '84', '86', '87', '89', '91', '92', '93', '94', '95', '96', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '113', '114', '116', '118', '119', '120', '121', '123', '124', '125', '126', '127', '128', '130', '131', '132', '143', '144', '145', '146', '147', '153', '156', '163', '164', '165', '168', '169', '170', '171', '172', '173', '174', '175', '176', '177', '181', '184', '195', '206', '210', '211', '212', '213', '214', '215', '216', '217', '219', '221', '222', '224', '227', '228', '238', '239', '248', '249', '251', '253', '254', '255', '267', '278', '279', '280', '281', '282', '288', '290', '291', '298', '299', '300', '301', '302', '303', '304', '305', '306', '307', '308', '309', '310', '311', '312', '317', '320', '331', '334', '342', '347', '348', '349', '350', '351', '352', '353', '354', '355', '356', '357', '359', '360', '362', '365', '366', '367', '368', '376', '377', '386', '387', '389', '391', '392', '393', '405', '407', '441', '443', '444', '445', '446', '447', '448', '542', '543', '544', '558', '559', '560', '571', '573', '575', '582', '583', '584', '586', '587', '588']
Numbers of column in the dataset 229

Deleted the columns which has zero standar deviation. Because if any feature contains zero stanad deviation means it is Non-Informative Feature. And non-informative features can lead to overfitting.

2.D Check for multi-collinearity in the data and take necessary action. 2.E Make all relevant modifications on the data using both functional/logical reasoning/assumptions.

In [19]:
X.corr()
Out[19]:
0 1 2 3 4 6 12 14 15 16 ... 541 545 561 570 572 574 576 577 585 589
0 1.000000 -0.143840 0.004756 -0.007613 -0.011014 0.002270 0.010368 -0.007058 0.030675 -0.005749 ... 0.034221 -0.015287 0.037917 -0.018953 0.013678 0.015206 0.013228 0.008601 0.023589 0.004174
1 -0.143840 1.000000 0.005767 -0.007568 -0.001636 -0.025564 0.034062 -0.037667 -0.087315 -0.001878 ... -0.015439 0.040333 -0.025492 -0.009000 0.001753 0.001303 0.002570 -0.010145 0.002273 0.044797
2 0.004756 0.005767 1.000000 0.298935 0.095891 -0.136225 0.018326 0.006476 0.006115 -0.000788 ... -0.004180 0.025334 0.025862 -0.037070 -0.000518 0.001342 0.002592 -0.028705 0.015752 -0.032890
3 -0.007613 -0.007568 0.298935 1.000000 -0.058483 -0.685835 -0.028223 -0.019827 -0.013157 -0.004596 ... 0.024721 0.046897 0.014912 0.002231 0.007634 0.006822 0.008216 0.016438 0.026019 -0.080341
4 -0.011014 -0.001636 0.095891 -0.058483 1.000000 -0.074368 -0.002707 -0.017523 0.011435 -0.001763 ... -0.044442 0.057173 -0.025806 0.005273 -0.012024 -0.012264 -0.012163 -0.004070 -0.001616 0.050910
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
574 0.015206 0.001303 0.001342 0.006822 -0.012264 0.007783 0.032908 0.000409 -0.024032 -0.014005 ... 0.033459 0.060316 -0.046796 -0.307529 0.993689 1.000000 0.991738 0.851784 -0.016812 -0.020471
576 0.013228 0.002570 0.002592 0.008216 -0.012163 0.007409 0.035743 -0.000985 -0.023509 -0.014167 ... 0.033358 0.056812 -0.046402 -0.360498 0.994772 0.991738 1.000000 0.859278 -0.017147 -0.022567
577 0.008601 -0.010145 -0.028705 0.016438 -0.004070 -0.012342 0.031434 0.009505 -0.019152 -0.004396 ... 0.051837 0.061725 -0.052455 -0.247655 0.863768 0.851784 0.859278 1.000000 -0.023910 -0.024766
585 0.023589 0.002273 0.015752 0.026019 -0.001616 -0.039517 0.000523 0.002535 0.017745 0.002643 ... 0.009112 -0.025979 0.023803 0.010143 -0.017179 -0.016812 -0.017147 -0.023910 1.000000 -0.003800
589 0.004174 0.044797 -0.032890 -0.080341 0.050910 0.043777 -0.036720 0.068161 0.009764 -0.013918 ... -0.070310 -0.061836 0.020439 -0.010583 -0.022672 -0.020471 -0.022567 -0.024766 -0.003800 1.000000

229 rows × 229 columns

Dropping the columns which has equal and more than 95% correlation between features.

In [20]:
correlation_matrix = X.corr()

threshold = 0.95
almost_same_correlations = {}
s=set()
# Iterate through the correlation matrix
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        correlation = correlation_matrix.iloc[i, j]
        if abs(correlation) >= threshold:
            col1 = correlation_matrix.columns[i]
            col2 = correlation_matrix.columns[j]
            if col1 not in almost_same_correlations:
                almost_same_correlations[col1] = [col2]

            else:
                almost_same_correlations[col1].append(col2)

print(almost_same_correlations)
# Extract the values (lists of column names) and flatten them
columns_to_drop = []
for cols in almost_same_correlations.values():
    for col in cols:
        columns_to_drop.append(col)

# Drop the columns from the DataFrame
X = X.drop(columns=columns_to_drop, axis=1)
{'36': ['34'], '140': ['4'], '148': ['16'], '152': ['16', '148'], '252': ['117'], '271': ['136'], '272': ['137'], '274': ['139'], '275': ['4', '140'], '277': ['142'], '283': ['16', '148', '152'], '285': ['150'], '286': ['151'], '287': ['16', '148', '152', '283'], '289': ['154'], '294': ['159'], '295': ['160'], '296': ['161'], '297': ['162'], '318': ['182'], '319': ['183'], '321': ['185'], '323': ['187'], '324': ['188'], '332': ['196'], '333': ['197'], '335': ['199', '332'], '338': ['202'], '339': ['203'], '340': ['204'], '341': ['205'], '343': ['207'], '344': ['208'], '361': ['223'], '363': ['225'], '388': ['250'], '390': ['117', '252'], '406': ['268'], '408': ['135'], '409': ['136', '271'], '410': ['137', '272'], '411': ['138'], '415': ['142', '277'], '421': ['16', '148', '152', '154', '283', '287', '289'], '424': ['151', '286'], '425': ['148', '152', '283', '287', '421'], '427': ['148', '154', '283', '289', '421'], '428': ['155'], '435': ['430', '434'], '436': ['430', '434', '435'], '437': ['166'], '452': ['180'], '454': ['182', '318'], '455': ['183', '319'], '457': ['185', '321'], '459': ['187', '323'], '469': ['197', '333'], '470': ['198'], '475': ['203', '339'], '477': ['205', '341'], '478': ['209'], '479': ['207', '343'], '490': ['218'], '495': ['223', '361'], '497': ['225', '363'], '522': ['250', '388'], '524': ['117', '252', '390'], '540': ['268', '406'], '541': ['269'], '574': ['572'], '576': ['572', '574']}

Dropping the features that have less correlation with target Pass/Fail column.

In [21]:
#combining the independent and dependent vaiables tocheck corrlation with target variable
X['Pass/Fail']=y
plt.figure(figsize=(12, 10))
sns.heatmap(X.corr()[['Pass/Fail']].sort_values(by='Pass/Fail', ascending=True), cmap='coolwarm', linewidths=.5,fmt=".2f", vmin=-1, vmax=1 )
plt.title("Correlation Heatmap")
plt.show()
In [22]:
X.shape
Out[22]:
(1567, 157)

Dropping the column which has less than .06 correlation .

In [23]:
correlation_threshold = 0.06
target_variable = 'Pass/Fail'
columns_to_drop = []

for column in X.columns:
    if column != target_variable:
        correlation = X[column].corr(X[target_variable])
        if abs(correlation) < correlation_threshold:
            columns_to_drop.append(column)

X = X.drop(columns_to_drop, axis=1)
X.head()
Out[23]:
14 21 22 28 33 59 64 122 129 133 ... 294 295 316 431 436 437 452 460 510 Pass/Fail
0 7.9558 -5419.00 2916.50 64.2333 9.5126 -1.7264 21.7264 2.639 -0.0473 1000.7263 ... 418.1363 398.3185 6.2698 33.1562 3.1158 3.1136 5.9396 29.9394 64.6707 0
1 10.1548 -5441.50 2604.25 68.4222 9.7997 0.8073 19.1927 2.541 -0.0946 998.1081 ... 233.9865 26.5879 5.6522 2.2655 1.6779 3.2153 5.1072 40.4475 141.4365 0
2 9.5157 -5447.75 2701.75 67.1333 8.6590 23.8245 16.1755 2.882 -0.1892 998.4440 ... 251.4536 329.6406 5.7247 29.1663 0.8972 3.1281 4.8795 32.3594 240.7767 1
3 9.6052 -5468.25 2648.25 62.9333 8.6789 24.3791 15.6209 3.132 0.2838 980.4510 ... 415.5048 157.0889 5.4440 13.4051 1.3671 2.7013 4.4680 27.6824 113.5593 0
4 10.5661 -5476.25 2635.25 62.8333 8.7677 -12.2945 32.2945 3.148 -0.5677 993.1274 ... 319.1252 128.0296 4.8956 10.7390 1.5533 6.2069 4.3131 30.8924 148.0663 0

5 rows × 21 columns

In [24]:
#listing all the correlation with target.
a=X.corr()
a['Pass/Fail']
Out[24]:
14          -0.068975
21           0.107997
22          -0.073380
28          -0.106767
33           0.080945
59           0.155771
64           0.076551
122         -0.078362
129          0.103351
133          0.067789
200          0.060595
294          0.081761
295          0.091831
316         -0.089410
431          0.120304
436          0.106426
437          0.069692
452         -0.077100
460          0.060587
510          0.131587
Pass/Fail    1.000000
Name: Pass/Fail, dtype: float64
In [25]:
y=X['Pass/Fail']
 X=X.drop('Pass/Fail',axis=1)

Apply forward feature selection.

In [26]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
sfs1 = SFS(DecisionTreeClassifier(random_state=1),
           k_features=10,
           forward=True,
           floating=False,
           verbose=2,
           scoring='r2')

sfs1 = sfs1.fit(np.array(X_train), y_train)
[2023-10-22 08:32:14] Features: 1/10 -- score: -0.0711083437110834
[2023-10-22 08:32:14] Features: 2/10 -- score: -0.3350809464508094
[2023-10-22 08:32:15] Features: 3/10 -- score: -0.3074252801992528
[2023-10-22 08:32:15] Features: 4/10 -- score: -0.8295662100456619
[2023-10-22 08:32:15] Features: 5/10 -- score: -0.6444146948941469
[2023-10-22 08:32:16] Features: 6/10 -- score: -0.607491697799917
[2023-10-22 08:32:17] Features: 7/10 -- score: -0.720601909506019
[2023-10-22 08:32:17] Features: 8/10 -- score: -0.6762162723121626
[2023-10-22 08:32:18] Features: 9/10 -- score: -0.585640307181403
[2023-10-22 08:32:18] Features: 10/10 -- score: -0.6460730593607306

Creating a new dataframe with selected 10 independent features.

In [27]:
new_df = X.iloc[:, list(sfs1.k_feature_idx_)]
new_df['Pass/Fail'] = y
new_df.head()
Out[27]:
28 59 64 122 129 133 200 295 316 460 Pass/Fail
0 64.2333 -1.7264 21.7264 2.639 -0.0473 1000.7263 10.30 398.3185 6.2698 29.9394 0
1 68.4222 0.8073 19.1927 2.541 -0.0946 998.1081 8.02 26.5879 5.6522 40.4475 0
2 67.1333 23.8245 16.1755 2.882 -0.1892 998.4440 16.73 329.6406 5.7247 32.3594 1
3 62.9333 24.3791 15.6209 3.132 0.2838 980.4510 13.56 157.0889 5.4440 27.6824 0
4 62.8333 -12.2945 32.2945 3.148 -0.5677 993.1274 19.77 128.0296 4.8956 30.8924 0

3. Data analysis & visualisation

3.A Perform a detailed univariate Analysis with appropriate detailed comments after each analysis.

In [28]:
def hist_plot(df):
  cat_columns = df.columns
  ncols = 3
  nrows = len(cat_columns) // ncols + (len(cat_columns) % ncols > 0)
  plt.figure(figsize=(30,30))
  for i, j in enumerate(cat_columns):
    plt.subplot(nrows, ncols, i+1)
    ax=sns.histplot(x=j,data=df,hue='Pass/Fail')
    ax.set_title("Histogram of "+j)
  return plt.show()
In [29]:
new_df['Pass/Fail'] = new_df['Pass/Fail'].astype('category')
new_df['Pass/Fail'].value_counts()
Out[29]:
0    1463
1     104
Name: Pass/Fail, dtype: int64
In [30]:
new_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567 entries, 0 to 1566
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   28         1567 non-null   float64 
 1   59         1567 non-null   float64 
 2   64         1567 non-null   float64 
 3   122        1567 non-null   float64 
 4   129        1567 non-null   float64 
 5   133        1567 non-null   float64 
 6   200        1567 non-null   float64 
 7   295        1567 non-null   float64 
 8   316        1567 non-null   float64 
 9   460        1567 non-null   float64 
 10  Pass/Fail  1567 non-null   category
dtypes: category(1), float64(10)
memory usage: 124.2 KB
In [31]:
hist_plot(new_df)
In [32]:
def  box_plot(df):
  cat_columns = []
  for i in df.columns:
    if (((df[i].dtype.name) == 'float64') ):
      cat_columns.append(i)
  ncols = 3
  nrows = len(cat_columns) // ncols + (len(cat_columns) % ncols > 0)
  plt.figure(figsize=(30,30))
  for i, j in enumerate(cat_columns):
    plt.subplot(nrows, ncols, i+1)
    ax=sns.boxplot(X[j],color='blue')
    ax.set_title("Histogram of "+j)
  return plt.show()
In [33]:
box_plot(new_df)

Analysis:

  1. The dataset contails outliers.
  2. In Dataset ,the pass and fail results count is not equal.
  3. Right side skewness on features - 200,295,316,460
  4. Left Side skewness on features -129, 133

Adjust Outliers.

In [34]:
for i in new_df.columns:
    if (((new_df[i].dtype.name) == 'float64') ):
      Q1=X[i].quantile(0.25)
      Q3=X[i].quantile(0.75)
      IQR=Q3-Q1
      lower = Q1 - 1.5*IQR
      upper = Q3 + 1.5*IQR
      median = X[i].median()
      new_df[i][new_df[i] < lower] = median
      new_df[i][new_df[i] > upper] = median

Perform bivariate and multivariate analysis with appropriate detailed comments after each analysis

In [35]:
def  bar_plot(df):
  cat_columns = []
  for i in df.columns:
    if (((df[i].dtype.name) == 'float64')):
       cat_columns.append(i)
  ncols = 3
  nrows = len(cat_columns) // ncols + (len(cat_columns) % ncols > 0)
  plt.figure(figsize=(30,30))
  for i, j in enumerate(cat_columns):
        plt.subplot(nrows, ncols, i + 1)
        ax = sns.barplot(x=j, y='Pass/Fail', data=df)
        ax.set_title("Bar Plot of " + j)

  plt.tight_layout()
  plt.show()
In [36]:
bar_plot(new_df)
In [37]:
def  scatter_plot(df):
  cat_columns = []
  for i in df.columns:
    if (((df[i].dtype.name) == 'float64')):
       cat_columns.append(i)
  ncols = 3
  nrows = len(cat_columns) // ncols + (len(cat_columns) % ncols > 0)
  plt.figure(figsize=(30,30))
  for i, j in enumerate(cat_columns):
        plt.subplot(nrows, ncols, i + 1)
        ax = sns.pointplot(x=j, y='Pass/Fail', data=df)
        ax.set_title("scatter Plot of " + j)

  plt.tight_layout()
  plt.show()
In [38]:
scatter_plot(new_df)

Analysis -

  1. Outliers still exists.
  2. For both pass and fail data, the central tendency is not points are very diferent.

Multi Variate plot

In [39]:
sns.pairplot(new_df, hue ='Pass/Fail', corner=True)
Out[39]:
<seaborn.axisgrid.PairGrid at 0x7aa966951c60>
In [40]:
sns.heatmap(new_df.corr(),vmin=-1, vmax=1 )
plt.title("Correlation Heatmap")
plt.show()

Analysis:

  1. data set is imbalanced.
  2. Most of the columns are not correlated.
  3. Some of the colmuns data is not normalized. Forex- feature 122 has mutiple peaks.
  4. Featur 59 and 64 has negative correlation.

4. Data pre-processing

4.A Segregate predictors vs target attributes

In [41]:
predictor = new_df.drop('Pass/Fail',axis=1)
target=new_df['Pass/Fail']
  1. Check for target balancing and fix it if found imbalanced
  2. Perform train-test split and standardise the data or vice versa if required.
In [42]:
target.value_counts()
Out[42]:
0    1463
1     104
Name: Pass/Fail, dtype: int64

The Target is not balanced.

In [43]:
#RandomOverSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
ROS =  RandomOverSampler()
X_ros1, y_ros1 = ROS.fit_resample(predictor,target)
X_train_ros, X_test_ros, y_train_ros, y_test_ros = train_test_split(X_ros1, y_ros1, test_size=0.25, random_state=1)
print('Original dataset shape {}'.format(Counter(target)))
print('After Over sampling shape {}'.format(Counter(y_ros1)))
print('Resampled dataset shape {}'.format(Counter(y_train_ros)))
print('Resampled dataset shape {}'.format(Counter(y_test_ros)))
Original dataset shape Counter({0: 1463, 1: 104})
After Over sampling shape Counter({0: 1463, 1: 1463})
Resampled dataset shape Counter({1: 1101, 0: 1093})
Resampled dataset shape Counter({0: 370, 1: 362})
In [44]:
#RandomUnderSampler
from imblearn.under_sampling import RandomUnderSampler
undersample = RandomUnderSampler()
X_train_rus, y_train_rus = undersample.fit_resample(predictor,target)
X_train_rus, X_test_rus, y_train_rus, y_test_rus = train_test_split(X_train_rus, y_train_rus, test_size=0.25, random_state=5)
print('Original dataset shape {}'.format(Counter(target)))
print('After Over sampling shape {}'.format(Counter(y_train_rus)))
print('Resampled dataset shape {}'.format(Counter(y_train_rus)))
print('Resampled dataset shape {}'.format(Counter(y_test_rus)))
Original dataset shape Counter({0: 1463, 1: 104})
After Over sampling shape Counter({0: 83, 1: 73})
Resampled dataset shape Counter({0: 83, 1: 73})
Resampled dataset shape Counter({1: 31, 0: 21})

Scaling

In [45]:
X_train_rus_scaled  = X_train_rus.apply(zscore)
X_train_rus_scaled.sample(5)
Out[45]:
28 59 64 122 129 133 200 295 316 460
49 -1.664801 -2.413144 2.106716 -0.727778 0.095865 -0.069291 -0.477063 -1.179800 0.947756 -0.928872
43 1.137049 -0.337824 0.000450 1.100253 0.250467 0.327205 2.247847 -0.792831 -0.245084 -0.730333
10 0.188612 0.305372 0.219912 -0.392921 0.173166 -1.282260 -0.033754 2.828865 -0.233635 1.687242
97 0.415713 -0.042034 -0.299752 0.665070 -0.213339 -1.232266 1.037035 -0.298007 0.685225 -0.486726
30 -0.950167 -0.182126 -2.163097 -0.512792 1.410145 0.716021 -1.060306 0.826613 -1.186502 -0.190533
In [46]:
X_train_ros_scaled  = X_train_ros.apply(zscore)
X_train_ros_scaled.sample(5)
Out[46]:
28 59 64 122 129 133 200 295 316 460
2168 0.595146 -1.106932 0.793389 -1.221683 -2.848394 -1.542294 -0.377662 -0.031744 0.511652 -1.579579
1797 -0.155191 -0.114409 -1.192452 -1.397890 0.123528 1.151438 0.433035 -1.065471 -1.871358 -0.233724
2819 -0.705625 -0.114409 1.625472 -0.628075 1.015403 -0.487242 0.171676 0.308400 1.793004 0.826359
2857 -0.250203 -0.114409 0.460314 -0.790536 0.272148 0.192397 -1.505377 -0.982130 -0.740909 -0.005067
1219 -0.466443 -0.714622 0.398440 -1.030479 -0.025091 -1.158400 0.125696 -1.562560 -0.013861 -0.527385

4.D Check if the train and test data have similar statistical characteristics when compared with original data.

In [47]:
X_train_ros_scaled.describe()
Out[47]:
28 59 64 122 129 133 200 295 316 460
count 2.194000e+03 2.194000e+03 2.194000e+03 2.194000e+03 2.194000e+03 2.194000e+03 2.194000e+03 2.194000e+03 2.194000e+03 2.194000e+03
mean -3.037781e-15 3.238572e-18 6.962930e-17 -7.740188e-16 6.477144e-18 -3.270958e-15 3.400501e-16 1.465454e-16 6.517627e-16 -6.477144e-17
std 1.000228e+00 1.000228e+00 1.000228e+00 1.000228e+00 1.000228e+00 1.000228e+00 1.000228e+00 1.000228e+00 1.000228e+00 1.000228e+00
min -2.576529e+00 -3.021212e+00 -2.531580e+00 -2.617598e+00 -3.442244e+00 -3.047521e+00 -2.964631e+00 -2.469624e+00 -2.370397e+00 -2.058927e+00
25% -5.975201e-01 -4.685990e-01 -6.817915e-01 -7.839754e-01 -4.711075e-01 -6.418430e-01 -6.626157e-01 -6.672683e-01 -6.321166e-01 -6.746673e-01
50% -5.032971e-02 -1.144091e-01 4.984731e-02 -1.406923e-01 1.235283e-01 -5.021698e-02 -5.096330e-02 -1.051319e-01 -8.589300e-02 -2.234368e-01
75% 6.303629e-01 3.897467e-01 5.786758e-01 6.703631e-01 4.950775e-01 8.315572e-01 6.284489e-01 6.802064e-01 5.257164e-01 5.516281e-01
max 2.669168e+00 3.033921e+00 2.753641e+00 2.918573e+00 2.427133e+00 2.739132e+00 2.732025e+00 3.255473e+00 2.891353e+00 2.961686e+00
In [48]:
X_test_ros_scaled  = X_test_ros.apply(zscore)
X_test_ros_scaled.describe()
Out[48]:
28 59 64 122 129 133 200 295 316 460
count 7.320000e+02 7.320000e+02 7.320000e+02 7.320000e+02 7.320000e+02 7.320000e+02 7.320000e+02 7.320000e+02 7.320000e+02 7.320000e+02
mean 1.004661e-15 3.882747e-17 5.581449e-16 -1.613767e-16 9.706868e-18 -1.344644e-14 -4.222488e-16 -3.621875e-16 1.048342e-15 4.610762e-17
std 1.000684e+00 1.000684e+00 1.000684e+00 1.000684e+00 1.000684e+00 1.000684e+00 1.000684e+00 1.000684e+00 1.000684e+00 1.000684e+00
min -2.493913e+00 -2.865051e+00 -2.358150e+00 -2.703983e+00 -3.373055e+00 -2.859493e+00 -2.839141e+00 -2.214331e+00 -2.321334e+00 -1.956372e+00
25% -5.294866e-01 -5.095051e-01 -6.678818e-01 -8.463462e-01 -5.324393e-01 -6.553393e-01 -6.911868e-01 -7.600053e-01 -6.389683e-01 -6.586029e-01
50% -2.639759e-02 -1.420608e-01 1.807078e-02 -3.500927e-02 1.235258e-01 -8.915859e-02 1.229159e-02 -4.971446e-02 -1.108107e-01 -2.106576e-01
75% 6.555514e-01 3.873871e-01 5.802138e-01 6.685821e-01 4.149973e-01 7.951738e-01 6.600779e-01 7.363206e-01 5.444727e-01 5.011955e-01
max 2.358038e+00 2.748725e+00 2.645270e+00 3.027301e+00 2.382430e+00 2.712791e+00 2.704269e+00 2.842560e+00 2.980978e+00 2.946360e+00
In [49]:
X_train_rus_scaled.describe()
Out[49]:
28 59 64 122 129 133 200 295 316 460
count 1.560000e+02 1.560000e+02 1.560000e+02 1.560000e+02 1.560000e+02 1.560000e+02 1.560000e+02 1.560000e+02 1.560000e+02 1.560000e+02
mean 3.905708e-15 -7.401487e-17 5.978124e-17 7.970832e-17 4.554761e-17 -5.921189e-16 3.273735e-17 -7.187982e-17 8.540177e-16 2.846726e-16
std 1.003221e+00 1.003221e+00 1.003221e+00 1.003221e+00 1.003221e+00 1.003221e+00 1.003221e+00 1.003221e+00 1.003221e+00 1.003221e+00
min -2.593190e+00 -2.541188e+00 -2.304132e+00 -2.331703e+00 -3.381372e+00 -2.836720e+00 -2.569738e+00 -2.118678e+00 -2.062627e+00 -1.938437e+00
25% -5.736220e-01 -3.892945e-01 -6.483117e-01 -8.079093e-01 -5.227067e-01 -6.472617e-01 -6.246233e-01 -7.648309e-01 -6.646130e-01 -6.890714e-01
50% -5.183300e-02 -1.821258e-01 -9.624247e-03 -6.979107e-02 9.586462e-02 -6.377202e-02 -7.812490e-02 -4.171869e-03 -1.901067e-01 -2.253345e-01
75% 6.720203e-01 3.160072e-01 5.146136e-01 7.432465e-01 4.050686e-01 7.650487e-01 6.526782e-01 6.252778e-01 5.598536e-01 3.850439e-01
max 2.118848e+00 2.904413e+00 2.772568e+00 2.837079e+00 2.492195e+00 2.581711e+00 2.653784e+00 2.828865e+00 2.651490e+00 3.009957e+00
In [50]:
p=predictor.apply(zscore)
p.describe()
Out[50]:
28 59 64 122 129 133 200 295 316 460
count 1.567000e+03 1567.000000 1.567000e+03 1.567000e+03 1.567000e+03 1.567000e+03 1.567000e+03 1.567000e+03 1.567000e+03 1.567000e+03
mean -1.414737e-15 0.000000 -8.161946e-17 -3.355467e-16 1.813766e-17 1.988794e-14 -5.259921e-16 2.902025e-16 -3.627532e-17 -3.627532e-16
std 1.000319e+00 1.000319 1.000319e+00 1.000319e+00 1.000319e+00 1.000319e+00 1.000319e+00 1.000319e+00 1.000319e+00 1.000319e+00
min -2.766770e+00 -3.048239 -2.668195e+00 -2.598893e+00 -3.326342e+00 -2.804903e+00 -2.762710e+00 -2.234737e+00 -2.504976e+00 -1.911839e+00
25% -6.251964e-01 -0.597686 -6.708041e-01 -7.809149e-01 -5.028984e-01 -6.402378e-01 -6.900538e-01 -7.603113e-01 -6.696210e-01 -7.176864e-01
50% -1.114003e-01 0.100060 2.097773e-02 2.298543e-02 9.193274e-02 -4.919240e-02 1.653343e-02 -1.805165e-02 -9.412606e-02 -1.389204e-01
75% 7.987121e-01 0.497567 6.100452e-01 5.953340e-01 3.892697e-01 7.117756e-01 6.642384e-01 7.245946e-01 5.765615e-01 5.785033e-01
max 2.455863e+00 3.453976 2.854026e+00 2.661488e+00 2.544963e+00 2.738357e+00 2.805198e+00 3.132479e+00 2.735286e+00 2.937623e+00
In [51]:
X_test_rus_scaled  = X_test_rus.apply(zscore)
X_test_rus_scaled.describe()
Out[51]:
28 59 64 122 129 133 200 295 316 460
count 5.200000e+01 5.200000e+01 5.200000e+01 5.200000e+01 5.200000e+01 5.200000e+01 5.200000e+01 5.200000e+01 5.200000e+01 5.200000e+01
mean -1.938620e-15 -8.807058e-17 4.056584e-17 -1.216975e-16 -8.273297e-18 -7.899664e-16 -6.138252e-16 3.373370e-16 3.031763e-16 -1.281027e-16
std 1.009756e+00 1.009756e+00 1.009756e+00 1.009756e+00 1.009756e+00 1.009756e+00 1.009756e+00 1.009756e+00 1.009756e+00 1.009756e+00
min -2.074250e+00 -2.499621e+00 -2.031822e+00 -1.935863e+00 -3.136782e+00 -2.338093e+00 -2.243822e+00 -1.953942e+00 -1.591972e+00 -1.565324e+00
25% -6.915126e-01 -3.941518e-01 -7.277035e-01 -7.033311e-01 -3.753652e-01 -6.707714e-01 -7.975893e-01 -5.687025e-01 -8.261444e-01 -6.713383e-01
50% -1.545828e-01 -1.045992e-01 -2.342004e-02 -1.041162e-01 1.690075e-01 -2.002836e-01 8.802193e-03 -1.504846e-01 -1.235402e-01 -1.725936e-01
75% 6.488227e-01 4.944892e-01 6.695891e-01 7.967360e-01 5.923753e-01 7.341302e-01 7.666664e-01 6.861950e-01 6.405214e-01 6.957530e-01
max 2.152662e+00 2.619401e+00 2.197226e+00 1.992324e+00 2.588252e+00 1.954345e+00 2.016993e+00 2.719871e+00 2.439800e+00 3.003254e+00

Analysis: (RandomOverSample,randomUnderSample,oriinal dataset)

  1. Standard deviation is almost similar. among test, train, andoriginal dataset.
  2. There is a differnce in test , train and original data set for other parameters.

5 Model training, testing and tuning

5.A Use any Supervised Learning technique to train a model

In [52]:
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer
def test_train_analysis(ytrain,ytest,predict_train,predict_test):
    train=performance_analysis(ytrain,predict_train)
    test=performance_analysis(ytest,predict_test)
    data= { 'train' : train,
            'test' :test
       }
    Name= ['Accuracy', 'Recall', 'Precision', 'F1-score',"roc_auc_score"]
    index=Name
    df2 = pd.DataFrame(data, index)
    df2.reset_index(inplace = True)
    display(df2)


def conf_metrix(y,pred):
    cm = metrics.confusion_matrix(y, pred, labels=[0, 1])
    df_cm = pd.DataFrame(cm, index = [i for i in ["Fail","Pass"]],
                  columns = [i for i in ["Fail","Pass"]])
    plt.figure(figsize = (7,5))
    sns.heatmap(df_cm, annot=True,fmt= 'd')
    plt.show()

def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    lb = preprocessing.LabelBinarizer()
    lb.fit(y_test)
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)
    return roc_auc_score(y_test, y_pred, average=average)

def complete_analysis(X_train,X_test,y_train,y_test,ML):
  pred_train = ML.predict(X_train)
  pred_test = ML.predict(X_test)
  test_train_analysis(y_train,y_test,pred_train,pred_test)
  conf_metrix(y_test,pred_test )
  print("Classification report on training data=================================")
  print(classification_report(y_train,pred_train ))
  print("Classification report on test data=================================")
  print(classification_report(y_test,pred_test ))



def performance_analysis(a,b):
    q=[]
    q.append(accuracy_score(a, b))
    q.append(precision_score(a, b,average="macro"))
    q.append(recall_score(a, b,average="macro"))
    q.append(f1_score(a, b,average="macro"))
    q.append(multiclass_roc_auc_score(a, b, average="macro"))
    return q


def summary_table(models,xtrain,xtest,ytrain,ytest):
  df_S = pd.DataFrame(index=['Accuracy', 'Recall', 'Precision', 'F1-score',"roc_auc_score"])
  for i in models:
    x=performance_analysis(ytrain,i.predict(xtrain))
    df_S [str(i)[0:9]+"_Train"] = x
    y=performance_analysis(ytest,i.predict(xtest))
    df_S [str(i)[0:9]+"_Test"]=y
  return df_S
In [53]:
def dtree_view(ML,train_x):
    fn = list(train_x)
    cn = ['No', 'Yes']
    fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4, 4), dpi=300)
    plot_tree(ML, feature_names = fn, class_names=cn, filled = True)

def dtree_ml(xtrain,xtest,ytrain,ytest):
  dTree1 = DecisionTreeClassifier(random_state=1)
  dTree1.fit(xtrain, ytrain)
  complete_analysis(xtrain,xtest,ytrain,ytest,dTree1)
  print("Top 5 features are************************************************")
  feature_scores = pd.Series(dTree1.feature_importances_, index=xtrain.columns).sort_values(ascending=False)
  print(feature_scores.head())
  dtree_view(dTree1,xtrain)
  return dTree1


# def dtree_view(ML,train_x):
#     fn = list(train_x)
#     cn = ['No', 'Yes']
#     fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4, 4), dpi=300)
#     plot_tree(ML, feature_names = fn, class_names=cn, filled = True)

Performance analysis on RandomUnder sample dataset using decision tree.

In [54]:
dtree_ml(X_train_rus_scaled,X_test_rus_scaled,y_train_rus, y_test_rus)
index train test
0 Accuracy 1.0 0.557692
1 Recall 1.0 0.565476
2 Precision 1.0 0.567588
3 F1-score 1.0 0.556215
4 roc_auc_score 1.0 0.567588
Classification report on training data=================================
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        83
           1       1.00      1.00      1.00        73

    accuracy                           1.00       156
   macro avg       1.00      1.00      1.00       156
weighted avg       1.00      1.00      1.00       156

Classification report on test data=================================
              precision    recall  f1-score   support

           0       0.46      0.62      0.53        21
           1       0.67      0.52      0.58        31

    accuracy                           0.56        52
   macro avg       0.57      0.57      0.56        52
weighted avg       0.58      0.56      0.56        52

Top 5 features are************************************************
59     0.167229
460    0.143660
129    0.136141
316    0.125753
200    0.104427
dtype: float64
Out[54]:
DecisionTreeClassifier(random_state=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(random_state=1)

Performance analysis on RandomOver sample dataset using decision tree.

In [55]:
dtree_ml(X_train_ros_scaled,X_test_ros_scaled,y_train_ros, y_test_ros)
index train test
0 Accuracy 1.0 0.864754
1 Recall 1.0 0.879573
2 Precision 1.0 0.863678
3 F1-score 1.0 0.863160
4 roc_auc_score 1.0 0.863678
Classification report on training data=================================
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1093
           1       1.00      1.00      1.00      1101

    accuracy                           1.00      2194
   macro avg       1.00      1.00      1.00      2194
weighted avg       1.00      1.00      1.00      2194

Classification report on test data=================================
              precision    recall  f1-score   support

           0       0.81      0.96      0.88       370
           1       0.95      0.77      0.85       362

    accuracy                           0.86       732
   macro avg       0.88      0.86      0.86       732
weighted avg       0.88      0.86      0.86       732

Top 5 features are************************************************
59     0.171870
200    0.135276
64     0.125750
460    0.123556
122    0.092750
dtype: float64
Out[55]:
DecisionTreeClassifier(random_state=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(random_state=1)

Performance is better on decision tree with RandomOver sample dataset

5.B Use cross validation techniques.

In [56]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
In [57]:
def cross_validation(n_split,ml, xtrain, ytrain):
  kf = KFold(n_splits=n_split, shuffle=True, random_state=0)
  scores = cross_val_score(ml, xtrain, ytrain, cv=kf, scoring='accuracy')
  print("Cross-validation scoreson train data:", scores)
  mean_accuracy = np.mean(scores)
  print("Mean accuracy on traindata:", mean_accuracy)
  print("Standard Deviation:", np.std(scores))

Basic cross validation

In [58]:
dtree1= DecisionTreeClassifier(random_state=1)
# Cross Validation on random under sampler
print("****Cross Validation on random under sampler****")
cross_validation(5,dtree1,X_train_rus_scaled, y_train_rus)
print("****Cross Validation on random over sampler****")
# Cross Validation on random over sampler
cross_validation(5,dtree1,X_train_ros_scaled, y_train_ros)
****Cross Validation on random under sampler****
Cross-validation scoreson train data: [0.71875    0.51612903 0.5483871  0.48387097 0.67741935]
Mean accuracy on traindata: 0.5889112903225806
Standard Deviation: 0.09237391919221724
****Cross Validation on random over sampler****
Cross-validation scoreson train data: [0.92710706 0.97266515 0.9498861  0.95899772 0.95205479]
Mean accuracy on traindata: 0.9521421661934035
Standard Deviation: 0.0148327026890827

Stratified Kfold Using Scikit-Learn

In [59]:
from sklearn.model_selection import StratifiedKFold
In [60]:
def stratified_fold_fun(n_splits_count,ml,xtrain, ytrain):
  stratified_kfold = StratifiedKFold(n_splits=n_splits_count, shuffle=True, random_state=1)
  scores = cross_val_score(ml, xtrain, ytrain, cv=stratified_kfold, scoring='accuracy')
  print("Cross-validation scores:", scores)
  mean_accuracy = np.mean(scores)
  print("Mean accuracy:", mean_accuracy)
  print("Standard Deviation:", np.std(scores))
In [61]:
## Stratified Cross Validation on random under sampler
print("**** Stratified Cross Validation on random under sampler****")
stratified_fold_fun(5,dtree1,X_train_rus_scaled, y_train_rus)
## Stratified Cross Validation on random over sampler
print("**** Stratified Cross Validation on random over sampler****")
stratified_fold_fun(5,dtree1,X_train_ros_scaled, y_train_ros)
**** Stratified Cross Validation on random under sampler****
Cross-validation scores: [0.59375    0.58064516 0.58064516 0.64516129 0.74193548]
Mean accuracy: 0.6284274193548386
Standard Deviation: 0.061535217448810756
**** Stratified Cross Validation on random over sampler****
Cross-validation scores: [0.95216401 0.95899772 0.95216401 0.94305239 0.96575342]
Mean accuracy: 0.9544263113551971
Standard Deviation: 0.007600044221755339

LeaveOneOut

In [62]:
from sklearn.model_selection import LeaveOneOut
def loocv_fun(ml,xtrain, ytrain):
  loocv = LeaveOneOut()
  scores =cross_val_score(ml,xtrain, ytrain, cv=loocv,scoring='accuracy')
  mean_accuracy = np.mean(scores)
  print("Mean accuracy:", mean_accuracy)
  print("Standard Deviation:", np.std(scores))
In [63]:
dtree1= DecisionTreeClassifier(random_state=1)
## LOOCV Cross Validation on random under sampler
print("**** LOOCV Cross Validation on random under sampler****")
loocv_fun(dtree1,X_train_rus_scaled, y_train_rus)
## LOOCV Cross Validation on random over sampler
print("**** LOOCV Cross Validation on random over sampler****")
loocv_fun(dtree1,X_train_ros_scaled, y_train_ros)
**** LOOCV Cross Validation on random under sampler****
Mean accuracy: 0.6089743589743589
Standard Deviation: 0.4879801113632886
**** LOOCV Cross Validation on random over sampler****
Mean accuracy: 0.9658158614402917
Standard Deviation: 0.18170190761419916

Analysis:

  1. Accuracy is improved on LOOCV cross validation method for both Random Under sample dataset and Random Over sample dataset but the standard deviation is also very high in LOOCV.

  2. I will choose Stratified Cross Validation , as it has good accuracy mean with low standard deviation.

5.C. Apply hyper-parameter tuning techniques to get the best accuracy

In [64]:
def grid_search_dtree(xtrain,xtest,ytrain,ytest):
  param_grid = {'max_features': ['auto', 'sqrt', 'log2'],
                'ccp_alpha': [0.1, .01, .001],
                'max_depth' : list(range(1,20)),
                'criterion' :['gini', 'entropy',"log_loss"],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
               }
  dtree_grid = DecisionTreeClassifier(random_state=1)
  dtree_grid = GridSearchCV(estimator=dtree_grid, param_grid=param_grid, cv=5, verbose=True)
  dtree_grid .fit(xtrain, ytrain)
  print(dtree_grid.best_params_)
  print(dtree_grid.best_estimator_)
  complete_analysis(xtrain,xtest,ytrain,ytest,dtree_grid)
  dtree_view(dtree_grid.best_estimator_,xtrain)
  print("Top 5 features are************************************************")
  feature_scores = pd.Series(dtree_grid.best_estimator_.feature_importances_, index=xtrain.columns).sort_values(ascending=False)
  print(feature_scores.head())
  return dtree_grid
In [65]:
grid_search_dtree(X_train_ros_scaled, X_test_ros_scaled, y_train_ros, y_test_ros)
Fitting 5 folds for each of 4617 candidates, totalling 23085 fits
{'ccp_alpha': 0.001, 'criterion': 'entropy', 'max_depth': 19, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2}
DecisionTreeClassifier(ccp_alpha=0.001, criterion='entropy', max_depth=19,
                       max_features='auto', random_state=1)
index train test
0 Accuracy 1.0 0.837432
1 Recall 1.0 0.847419
2 Precision 1.0 0.836501
3 F1-score 1.0 0.835974
4 roc_auc_score 1.0 0.836501
Classification report on training data=================================
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1093
           1       1.00      1.00      1.00      1101

    accuracy                           1.00      2194
   macro avg       1.00      1.00      1.00      2194
weighted avg       1.00      1.00      1.00      2194

Classification report on test data=================================
              precision    recall  f1-score   support

           0       0.79      0.92      0.85       370
           1       0.90      0.75      0.82       362

    accuracy                           0.84       732
   macro avg       0.85      0.84      0.84       732
weighted avg       0.85      0.84      0.84       732

Top 5 features are************************************************
28     0.151516
59     0.130922
316    0.122585
200    0.105729
460    0.104343
dtype: float64
Out[65]:
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=1),
             param_grid={'ccp_alpha': [0.1, 0.01, 0.001],
                         'criterion': ['gini', 'entropy', 'log_loss'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10]},
             verbose=True)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=1),
             param_grid={'ccp_alpha': [0.1, 0.01, 0.001],
                         'criterion': ['gini', 'entropy', 'log_loss'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10]},
             verbose=True)
DecisionTreeClassifier(random_state=1)
DecisionTreeClassifier(random_state=1)
In [66]:
grid_search_dtree(X_train_rus_scaled, X_test_rus_scaled, y_train_rus, y_test_rus)
Fitting 5 folds for each of 4617 candidates, totalling 23085 fits
{'ccp_alpha': 0.01, 'criterion': 'gini', 'max_depth': 7, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10}
DecisionTreeClassifier(ccp_alpha=0.01, max_depth=7, max_features='auto',
                       min_samples_leaf=4, min_samples_split=10,
                       random_state=1)
index train test
0 Accuracy 0.833333 0.442308
1 Recall 0.835062 0.449405
2 Precision 0.835947 0.447773
3 F1-score 0.833306 0.440445
4 roc_auc_score 0.835947 0.447773
Classification report on training data=================================
              precision    recall  f1-score   support

           0       0.88      0.80      0.84        83
           1       0.79      0.88      0.83        73

    accuracy                           0.83       156
   macro avg       0.84      0.84      0.83       156
weighted avg       0.84      0.83      0.83       156

Classification report on test data=================================
              precision    recall  f1-score   support

           0       0.36      0.48      0.41        21
           1       0.54      0.42      0.47        31

    accuracy                           0.44        52
   macro avg       0.45      0.45      0.44        52
weighted avg       0.47      0.44      0.45        52

Top 5 features are************************************************
64     0.287066
200    0.180245
316    0.144408
59     0.133267
295    0.114281
dtype: float64
Out[66]:
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=1),
             param_grid={'ccp_alpha': [0.1, 0.01, 0.001],
                         'criterion': ['gini', 'entropy', 'log_loss'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10]},
             verbose=True)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=1),
             param_grid={'ccp_alpha': [0.1, 0.01, 0.001],
                         'criterion': ['gini', 'entropy', 'log_loss'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10]},
             verbose=True)
DecisionTreeClassifier(random_state=1)
DecisionTreeClassifier(random_state=1)
In [67]:
from sklearn.model_selection import RandomizedSearchCV
In [68]:
def random_search_dtree(xtrain,xtest,ytrain,ytest):
  param_grid = {'max_features': ['auto', 'sqrt', 'log2'],
                'ccp_alpha': [0.1, .01, .001],
                'max_depth' : list(range(1,20)),
                'criterion' :['gini', 'entropy',"log_loss"],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
               }
  dtree_rm = DecisionTreeClassifier(random_state=1)
  dtree_rm = RandomizedSearchCV(estimator=dtree_rm, param_distributions=param_grid, cv=5, verbose=True)
  dtree_rm .fit(xtrain, ytrain)
  print(dtree_rm.best_params_)
  print(dtree_rm.best_estimator_)
  complete_analysis(xtrain,xtest,ytrain,ytest,dtree_rm)
  dtree_view(dtree_rm.best_estimator_,xtrain)
  print("Top 5 features are************************************************")
  feature_scores = pd.Series(dtree_rm.best_estimator_.feature_importances_, index=xtrain.columns).sort_values(ascending=False)
  print(feature_scores.head())
  return dtree_rm
In [69]:
random_search_dtree(X_train_ros_scaled, X_test_ros_scaled, y_train_ros, y_test_ros)
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 18, 'criterion': 'entropy', 'ccp_alpha': 0.001}
DecisionTreeClassifier(ccp_alpha=0.001, criterion='entropy', max_depth=18,
                       max_features='log2', min_samples_split=10,
                       random_state=1)
index train test
0 Accuracy 0.994075 0.834699
1 Recall 0.994165 0.845269
2 Precision 0.994053 0.833739
3 F1-score 0.994074 0.833130
4 roc_auc_score 0.994053 0.833739
Classification report on training data=================================
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1093
           1       0.99      1.00      0.99      1101

    accuracy                           0.99      2194
   macro avg       0.99      0.99      0.99      2194
weighted avg       0.99      0.99      0.99      2194

Classification report on test data=================================
              precision    recall  f1-score   support

           0       0.79      0.92      0.85       370
           1       0.90      0.75      0.82       362

    accuracy                           0.83       732
   macro avg       0.85      0.83      0.83       732
weighted avg       0.84      0.83      0.83       732

Top 5 features are************************************************
200    0.154357
59     0.150559
129    0.116408
28     0.107659
460    0.100061
dtype: float64
Out[69]:
RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=1),
                   param_distributions={'ccp_alpha': [0.1, 0.01, 0.001],
                                        'criterion': ['gini', 'entropy',
                                                      'log_loss'],
                                        'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                      10, 11, 12, 13, 14, 15,
                                                      16, 17, 18, 19],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10]},
                   verbose=True)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=1),
                   param_distributions={'ccp_alpha': [0.1, 0.01, 0.001],
                                        'criterion': ['gini', 'entropy',
                                                      'log_loss'],
                                        'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                      10, 11, 12, 13, 14, 15,
                                                      16, 17, 18, 19],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10]},
                   verbose=True)
DecisionTreeClassifier(random_state=1)
DecisionTreeClassifier(random_state=1)
In [70]:
random_search_dtree(X_train_rus_scaled, X_test_rus_scaled, y_train_rus, y_test_rus)
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 5, 'criterion': 'entropy', 'ccp_alpha': 0.001}
DecisionTreeClassifier(ccp_alpha=0.001, criterion='entropy', max_depth=5,
                       max_features='sqrt', random_state=1)
index train test
0 Accuracy 0.826923 0.538462
1 Recall 0.826137 0.550225
2 Precision 0.826622 0.551459
3 F1-score 0.826345 0.537778
4 roc_auc_score 0.826622 0.551459
Classification report on training data=================================
              precision    recall  f1-score   support

           0       0.84      0.83      0.84        83
           1       0.81      0.82      0.82        73

    accuracy                           0.83       156
   macro avg       0.83      0.83      0.83       156
weighted avg       0.83      0.83      0.83       156

Classification report on test data=================================
              precision    recall  f1-score   support

           0       0.45      0.62      0.52        21
           1       0.65      0.48      0.56        31

    accuracy                           0.54        52
   macro avg       0.55      0.55      0.54        52
weighted avg       0.57      0.54      0.54        52

Top 5 features are************************************************
316    0.245399
59     0.173167
200    0.152714
28     0.112035
129    0.106595
dtype: float64
Out[70]:
RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=1),
                   param_distributions={'ccp_alpha': [0.1, 0.01, 0.001],
                                        'criterion': ['gini', 'entropy',
                                                      'log_loss'],
                                        'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                      10, 11, 12, 13, 14, 15,
                                                      16, 17, 18, 19],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10]},
                   verbose=True)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=1),
                   param_distributions={'ccp_alpha': [0.1, 0.01, 0.001],
                                        'criterion': ['gini', 'entropy',
                                                      'log_loss'],
                                        'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                      10, 11, 12, 13, 14, 15,
                                                      16, 17, 18, 19],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10]},
                   verbose=True)
DecisionTreeClassifier(random_state=1)
DecisionTreeClassifier(random_state=1)

Analysis:

  1. RandomSearchCV is faster than GridsearchCV.
  2. esults are improved on RandomSearchCV.

Use any other technique/method which can enhance the model performance Display and explain the classification report in detail.

In [71]:
pca=PCA(n_components=10,random_state=10)
pca_model=pca.fit_transform(X_train_ros_scaled)
#calculate the variance
var_explained_per=pca.explained_variance_/np.sum(pca.explained_variance_)
print("Variance_explained_variance=",var_explained_per)
#Cummulative Sum
cum_var_explained=np.cumsum(var_explained_per)
print("Cummuative_vaiance_explained=",cum_var_explained)
plt.plot(cum_var_explained,marker='*',markerfacecolor='black', markersize=8)
plt.axhline(y = .90)
plt.xlabel('n_components')
plt.ylabel('Cummuative_vaiance_explained')
plt.show()
Variance_explained_variance= [0.16114244 0.12744209 0.11604669 0.10748442 0.09961551 0.09512159
 0.08998848 0.07857    0.06506174 0.05952703]
Cummuative_vaiance_explained= [0.16114244 0.28858453 0.40463122 0.51211564 0.61173115 0.70685274
 0.79684123 0.87541122 0.94047297 1.        ]
In [72]:
pca=PCA(n_components=7, random_state=10)
pca_model_train=pca.fit_transform(X_train_ros_scaled)
In [73]:
def data_analysis(y,predicted_x):
    result=performance_analysis(y,predicted_x)
    data= { 'performance' : result,
                  }
    Name= ['Accuracy', 'Recall', 'Precision', 'F1-score',"roc_auc_score"]
    index=Name
    df2 = pd.DataFrame(data, index)
    df2.reset_index(inplace = True)
    display(df2)
In [74]:
dtree_pca = DecisionTreeClassifier(random_state=1)
def pca_fun(ML,xtrain_scaled,ytrain,xtest,ytest):
  pca=PCA(n_components=7, random_state=10)
  pca_model_train=pca.fit_transform(xtrain_scaled)

  ML.fit(pca_model_train,ytrain)
  ML_model = ML.predict(pca_model_train)
  print(classification_report(ytrain,ML_model))
  data_analysis(ytrain,ML_model)
  conf_metrix(ytrain,ML_model)
  print("############")

  pca_model_test=pca.transform(xtest)
  dtree_pca_model2 = ML.predict(pca_model_test)
  print(classification_report(ytest,dtree_pca_model2))
  data_analysis(ytest,dtree_pca_model2)
  conf_metrix(ytest,dtree_pca_model2)
In [75]:
pca_fun(dtree_pca,X_train_ros_scaled,y_train_ros,X_test_ros_scaled,y_test_ros)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1093
           1       1.00      1.00      1.00      1101

    accuracy                           1.00      2194
   macro avg       1.00      1.00      1.00      2194
weighted avg       1.00      1.00      1.00      2194

index performance
0 Accuracy 1.0
1 Recall 1.0
2 Precision 1.0
3 F1-score 1.0
4 roc_auc_score 1.0
############
              precision    recall  f1-score   support

           0       0.71      0.95      0.81       370
           1       0.92      0.60      0.72       362

    accuracy                           0.77       732
   macro avg       0.81      0.77      0.77       732
weighted avg       0.81      0.77      0.77       732

index performance
0 Accuracy 0.773224
1 Recall 0.810450
2 Precision 0.771316
3 F1-score 0.765361
4 roc_auc_score 0.771316
In [76]:
pca_fun(dtree_pca,X_train_rus_scaled,y_train_rus,X_test_rus_scaled,y_test_rus)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        83
           1       1.00      1.00      1.00        73

    accuracy                           1.00       156
   macro avg       1.00      1.00      1.00       156
weighted avg       1.00      1.00      1.00       156

index performance
0 Accuracy 1.0
1 Recall 1.0
2 Precision 1.0
3 F1-score 1.0
4 roc_auc_score 1.0
############
              precision    recall  f1-score   support

           0       0.48      0.62      0.54        21
           1       0.68      0.55      0.61        31

    accuracy                           0.58        52
   macro avg       0.58      0.58      0.57        52
weighted avg       0.60      0.58      0.58        52

index performance
0 Accuracy 0.576923
1 Recall 0.580741
2 Precision 0.583717
3 F1-score 0.574405
4 roc_auc_score 0.583717

Classification report Analysis:

  1. On trained data , the performance is good.
  2. On test data performance is better on random Over sample dataset. Precision is better for pass test and recall is better for failed test. Over all there is a scope of improvement.

Apply the above steps for all possible models that you have learnt so far.

In [77]:
def stratified_fold_fun2(n_splits_count,ml,xtrain, ytrain):
  stratified_kfold = StratifiedKFold(n_splits=n_splits_count, shuffle=True, random_state=1)
  scores = cross_val_score(ml, xtrain, ytrain, cv=stratified_kfold, scoring='accuracy')

  mean_accuracy = np.mean(scores)

  return mean_accuracy
In [78]:
models = []
models.append(("LR", LogisticRegression()))
models.append(("KNN", KNeighborsClassifier()))
models.append(("SVM", SVC(kernel='sigmoid')))
models.append(("DT", DecisionTreeClassifier()))
models.append(("RF", RandomForestClassifier()))
models.append(("AB", AdaBoostClassifier()))
models.append(("GBT", GradientBoostingClassifier()))


df_S = pd.DataFrame(columns=['Model', 'mean accuracy_train_RUS','mean accuracy_train_ROS'])
#testing models
results = []
names = []



for name, model in models:
    a=stratified_fold_fun2(5,model,X_train_rus_scaled, y_train_rus)
    results.append(a)
    names.append(name)
    mean_accuracy_rus = a.mean()

    b = stratified_fold_fun2(5, model, X_train_ros_scaled, y_train_ros)
    mean_accuracy_ros = b.mean()

    df_S = df_S.append({'Model': name, 'mean accuracy_train_RUS': mean_accuracy_rus, 'mean accuracy_train_ROS': mean_accuracy_ros}, ignore_index=True)

# Display the DataFrame
df_S
Out[78]:
Model mean accuracy_train_RUS mean accuracy_train_ROS
0 LR 0.640323 0.651309
1 KNN 0.538105 0.889247
2 SVM 0.634274 0.519572
3 DT 0.621976 0.957615
4 RF 0.633669 0.986783
5 AB 0.551008 0.820427
6 GBT 0.570363 0.914311
In [79]:
def summary_table2(models,x,y):
  df_S1 = pd.DataFrame(index=['Accuracy', 'Recall', 'Precision', 'F1-score',"roc_auc_score"])
  for i in models:
    y=performance_analysis(y,i.predict(x))
    df_S1 [str(i)[0:9]+"result"]=y
  return df_S1

6.A. Display and compare all the models designed with their train and test accuracies.

In [80]:
print("*****************PCA on multiple algos o Random Over Sample TEST  data set")
models = [
    ("LR", LogisticRegression(random_state=1)),
    ("KNN", KNeighborsClassifier()),
    ("SVM", SVC(kernel='sigmoid')),
    ("DT", DecisionTreeClassifier(random_state=1)),
    ("RF", RandomForestClassifier(random_state=1)),
    ("AB", AdaBoostClassifier(random_state=1)),
    ("GBT", GradientBoostingClassifier(random_state=1))
]

# Create a DataFrame to store the results
df_S = pd.DataFrame(columns=['Model', 'Accuracy', 'Recall', 'Precision', 'F1-score', 'roc_auc_score'])

# Define a function for PCA and model training
def pca_fun_test(ML, xtrain_scaled, ytrain, xtest, ytest):
    pca = PCA(n_components=7, random_state=1)
    pca_model_train = pca.fit_transform(xtrain_scaled)
    ML.fit(pca_model_train, ytrain)
    pca_model_test = pca.transform(xtest)
    ML_model = ML.predict(pca_model_test)

    # Calculate evaluation metrics
    accuracy = accuracy_score(ytest, ML_model)
    recall = recall_score(ytest, ML_model)
    precision = precision_score(ytest, ML_model)
    f1 = f1_score(ytest, ML_model)
    roc_auc = roc_auc_score(ytest, ML_model)

    return [accuracy, recall, precision, f1, roc_auc]

def pca_fun_train(ML, xtrain_scaled, ytrain, xtest, ytest):
    pca = PCA(n_components=7, random_state=10)
    pca_model_train = pca.fit_transform(xtrain_scaled)
    ML.fit(pca_model_train, ytrain)
    ML_model = ML.predict(pca_model_train)


    # Calculate evaluation metrics
    accuracy = accuracy_score(ytrain, ML_model)
    recall = recall_score(ytrain, ML_model)
    precision = precision_score(ytrain, ML_model)
    f1 = f1_score(ytrain, ML_model)
    roc_auc = roc_auc_score(ytrain, ML_model)

    return [accuracy, recall, precision, f1, roc_auc]
# Testing models
results = []
names = []

for name, model in models:
    result = pca_fun_test(model,X_train_ros_scaled,y_train_ros,X_test_ros_scaled,y_test_ros)
    results.append(result)
    names.append(name)

# Append the results to the DataFrame
df_S['Model'] = names
df_S[['Accuracy', 'Recall', 'Precision', 'F1-score', 'roc_auc_score']] = results

# Display the DataFrame
print(df_S)
#
results.clear()
names.clear()

print("*****************PCA on multiple algos o Random Under Search on Test dataset")

df_S2 = pd.DataFrame(columns=['Model', 'Accuracy', 'Recall', 'Precision', 'F1-score', 'roc_auc_score'])

for name, model in models:
    result = pca_fun_test(model,X_train_rus_scaled,y_train_rus,X_test_rus_scaled,y_test_rus)
    results.append(result)
    names.append(name)

# Append the results to the DataFrame
df_S2['Model'] = names
df_S2[['Accuracy', 'Recall', 'Precision', 'F1-score', 'roc_auc_score']] = results

# Display the DataFrame
print(df_S2)

print("*****************PCA on multiple algos on Random Over Search on Train dataset")
results.clear()
names.clear()
df_S3 = pd.DataFrame(columns=['Model', 'Accuracy', 'Recall', 'Precision', 'F1-score', 'roc_auc_score'])

for name, model in models:
    result = pca_fun_train(model,X_train_rus_scaled,y_train_rus,X_test_rus_scaled,y_test_rus)
    results.append(result)
    names.append(name)

# Append the results to the DataFrame
df_S3['Model'] = names
df_S3[['Accuracy', 'Recall', 'Precision', 'F1-score', 'roc_auc_score']] = results

# Display the DataFrame
print(df_S3)



print("*****************PCA on multiple algos on Random Over Search on Train dataset")
results.clear()
names.clear()
df_S4 = pd.DataFrame(columns=['Model', 'Accuracy', 'Recall', 'Precision', 'F1-score', 'roc_auc_score'])

for name, model in models:
    result = pca_fun_train(model,X_train_ros_scaled,y_train_ros,X_test_ros_scaled,y_test_ros)
    results.append(result)
    names.append(name)

# Append the results to the DataFrame
df_S4['Model'] = names
df_S4[['Accuracy', 'Recall', 'Precision', 'F1-score', 'roc_auc_score']] = results

# Display the DataFrame
print(df_S4)
*****************PCA on multiple algos o Random Over Sample TEST  data set
  Model  Accuracy    Recall  Precision  F1-score  roc_auc_score
0    LR  0.659836  0.676796   0.649867  0.663058       0.660019
1   KNN  0.894809  1.000000   0.824601  0.903870       0.895946
2   SVM  0.545082  0.511050   0.542522  0.526316       0.544714
3    DT  0.773224  0.596685   0.915254  0.722408       0.771316
4    RF  0.931694  0.864641   0.996815  0.926036       0.930969
5    AB  0.733607  0.676796   0.758514  0.715328       0.732992
6   GBT  0.763661  0.704420   0.794393  0.746706       0.763021
*****************PCA on multiple algos o Random Under Search on Test dataset
  Model  Accuracy    Recall  Precision  F1-score  roc_auc_score
0    LR  0.653846  0.548387   0.809524  0.653846       0.678955
1   KNN  0.461538  0.483871   0.555556  0.517241       0.456221
2   SVM  0.653846  0.516129   0.842105  0.640000       0.686636
3    DT  0.576923  0.548387   0.680000  0.607143       0.583717
4    RF  0.576923  0.516129   0.695652  0.592593       0.591398
5    AB  0.500000  0.387097   0.631579  0.480000       0.526882
6   GBT  0.615385  0.580645   0.720000  0.642857       0.623656
*****************PCA on multiple algos on Random Over Search on Train dataset
  Model  Accuracy    Recall  Precision  F1-score  roc_auc_score
0    LR  0.679487  0.616438   0.671642  0.642857       0.675689
1   KNN  0.679487  0.698630   0.645570  0.671053       0.680640
2   SVM  0.589744  0.547945   0.563380  0.555556       0.587226
3    DT  1.000000  1.000000   1.000000  1.000000       1.000000
4    RF  1.000000  1.000000   1.000000  1.000000       1.000000
5    AB  0.916667  0.890411   0.928571  0.909091       0.915085
6   GBT  1.000000  1.000000   1.000000  1.000000       1.000000
*****************PCA on multiple algos on Random Over Search on Train dataset
  Model  Accuracy    Recall  Precision  F1-score  roc_auc_score
0    LR  0.667730  0.683015   0.664311  0.673533       0.667674
1   KNN  0.927074  1.000000   0.873117  0.932261       0.926807
2   SVM  0.520966  0.495005   0.524038  0.509108       0.521061
3    DT  1.000000  1.000000   1.000000  1.000000       1.000000
4    RF  1.000000  1.000000   1.000000  1.000000       1.000000
5    AB  0.813582  0.833787   0.802448  0.817817       0.813509
6   GBT  0.941203  0.988193   0.903654  0.944035       0.941031

Using only RandomOver Sample dataset and will perform pca+grid search

  1. The best trained results are found on random oversampe dataset.
  2. The best tested/trained model is Randomforest.
  3. RFon test data.922131 0.845304 0.996743 0.914798 0.921301
  4. RF on trained dataset 1.000000 1.000000 1.000000 1.000000 1.000000
  5. All the parameters are very impressive.
In [81]:
models = [
   ("KNN", KNeighborsClassifier()),
   ("SVM", SVC(kernel='sigmoid')),
   ("DT", DecisionTreeClassifier(random_state=1)),
   ("RF", RandomForestClassifier(random_state=1)),
   ("AB", AdaBoostClassifier(random_state=1)),
   ("GBT", GradientBoostingClassifier(random_state=1))
]

pca = PCA(n_components=7, random_state=1)

results_train = {
    "Model": [],
    "Accuracy": [],
    "Recall": [],
    "Precision": [],
    "F1-score": [],
    "roc_auc_score": []
}

results_test = {
    "Model": [],
    "Accuracy": [],
    "Recall": [],
    "Precision": [],
    "F1-score": [],
    "roc_auc_score": []
}

for name, model in models:

    X_pca_train = pca.fit_transform(X_train_ros_scaled)


    param_grids = {
        "KNN": [
            {
                "n_neighbors": [3, 5, 7, 9],
                "weights": ["uniform", "distance"]
            }
        ],
        "SVM": [
            {
                "C": [0.01, 0.1, 1],
                "kernel": ["linear", "poly", "rbf", "sigmoid"],
                "gamma": [ 0.01, 0.1, 1]
            }
        ],
        "DT": [
            {
                "criterion": ["gini", "entropy"],
                "max_depth": [10, 20, 30, 40],
                "min_samples_split": [2, 5, 10],
                "min_samples_leaf": [1, 2, 4]
            }
        ],
        "RF": [
            {
                "n_estimators": [100, 200],
                "criterion": ["gini", "entropy"],
                "max_depth": [10, 20, 30],
                "min_samples_split": [2, 5, 10],
                "min_samples_leaf": [1, 2, 4]
            }
        ],
        "AB": [
            {
                "n_estimators": [50, 100],
                "learning_rate": [0.01, 0.1, 1.0]
            }
        ],
        "GBT": [
            {
                "n_estimators": [50, 100, 200],
                "learning_rate": [0.01, 0.1, 0.5]
            }
        ]
    }

    grid_search = GridSearchCV(model, param_grids[name], cv=5)
    grid_search.fit(X_pca_train, y_train_ros)

    best_model = grid_search.best_estimator_

    accuracy_train = cross_val_score(best_model, X_pca_train, y_train_ros, cv=5, scoring='accuracy').mean()
    recall_train = cross_val_score(best_model, X_pca_train, y_train_ros, cv=5, scoring='recall').mean()
    precision_train = cross_val_score(best_model, X_pca_train, y_train_ros, cv=5, scoring='precision').mean()
    f1_train = cross_val_score(best_model, X_pca_train, y_train_ros, cv=5, scoring='f1').mean()
    roc_auc_train = cross_val_score(best_model, X_pca_train, y_train_ros, cv=5, scoring='roc_auc').mean()

    # Apply PCA to the test data
    X_pca_test = pca.transform(X_test_ros_scaled)
    y_pred_test = best_model.predict(X_pca_test)

    accuracy_test = accuracy_score(y_test_ros, y_pred_test)
    recall_test = recall_score(y_test_ros, y_pred_test)
    precision_test = precision_score(y_test_ros, y_pred_test)
    f1_test = f1_score(y_test_ros, y_pred_test)
    roc_auc_test = roc_auc_score(y_test_ros, y_pred_test)

    results_train["Model"].append(name)
    results_train["Accuracy"].append(accuracy_train)
    results_train["Recall"].append(recall_train)
    results_train["Precision"].append(precision_train)
    results_train["F1-score"].append(f1_train)
    results_train["roc_auc_score"].append(roc_auc_train)

    results_test["Model"].append(name)
    results_test["Accuracy"].append(accuracy_test)
    results_test["Recall"].append(recall_test)
    results_test["Precision"].append(precision_test)
    results_test["F1-score"].append(f1_test)
    results_test["roc_auc_score"].append(roc_auc_test)

results_train_df = pd.DataFrame(results_train)
results_test_df = pd.DataFrame(results_test)

print("Results for Train Data:")
print(results_train_df)
print("\nResults for Test Data:")
print(results_test_df)
Results for Train Data:
  Model  Accuracy    Recall  Precision  F1-score  roc_auc_score
0   KNN  0.908387  1.000000   0.845755  0.916393       0.947843
1   SVM  0.988603  1.000000   0.977867  0.988792       1.000000
2    DT  0.954418  1.000000   0.917510  0.956790       0.955960
3    RF  0.989056  1.000000   0.978828  0.989259       1.000000
4    AB  0.832741  0.872834   0.808815  0.839350       0.908117
5   GBT  0.965361  1.000000   0.935931  0.966775       0.997194

Results for Test Data:
  Model  Accuracy    Recall  Precision  F1-score  roc_auc_score
0   KNN  0.924863  1.000000   0.868106  0.929397       0.925676
1   SVM  0.990437  1.000000   0.981030  0.990424       0.990541
2    DT  0.758197  0.582873   0.890295  0.704508       0.756301
3    RF  0.927596  0.859116   0.993610  0.921481       0.926855
4    AB  0.733607  0.660221   0.768489  0.710253       0.732813
5   GBT  0.748634  0.541436   0.915888  0.680556       0.746394

Post Training and Conclusion

6.B Select the final best trained model along with your detailed comments for selecting this model.

Analysis:

  1. The best trained model on PCA+GridsearchCV is SVM and RF. Thought without gridsearchcv results are better on RandomForest model.
  2. The best test model on PCA+GridSearchCV is SVM and RF .

6.C Pickle the selected model for future use.

In [82]:
def model2():
  rfc=RandomForestClassifier()
  pca_fun_test(rfc,X_train_ros_scaled,y_train_ros,X_test_ros_scaled,y_test_ros)
  pca = PCA(n_components=7, random_state=10)
  pca_model_train = pca.fit_transform(X_train_ros_scaled)
  rfc=rfc.fit(pca_model_train, y_train_ros)
  pca_model_test = pca.transform(X_test_ros_scaled)
  rfc.predict(pca_model_test)
  return rfc
In [83]:
with open('rfc_model.pkl', 'wb') as model_file:
    pickle.dump(model2, model_file)

with open('rfc_model.pkl', 'rb') as model_file:
    loaded_rfc = pickle.load(model_file)

D. Write your conclusion on the results.

  1. The data set contains many outliers.
  2. The models shows best performance on randoover sample technique.
  3. All the parameters are improved on RandomForest also.
  4. Accuracy is improved on LOOCV cross validation method for both Random Under sample dataset and Random Over sample dataset but the standard deviation is also very high in LOOCV. I will choose Stratified Cross Validation , as it has good accuracy mean with low standard deviation.